{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 一、加载数据"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "数据处理部分：缺失值填充，非数值型标签数值化"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "             Survived  Pclass  \\\n",
      "PassengerId                     \n",
      "1                   0       3   \n",
      "2                   1       1   \n",
      "3                   1       3   \n",
      "4                   1       1   \n",
      "5                   0       3   \n",
      "...               ...     ...   \n",
      "887                 0       2   \n",
      "888                 1       1   \n",
      "889                 0       3   \n",
      "890                 1       1   \n",
      "891                 0       3   \n",
      "\n",
      "                                                          Name     Sex   Age  \\\n",
      "PassengerId                                                                    \n",
      "1                                      Braund, Mr. Owen Harris    male  22.0   \n",
      "2            Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0   \n",
      "3                                       Heikkinen, Miss. Laina  female  26.0   \n",
      "4                 Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0   \n",
      "5                                     Allen, Mr. William Henry    male  35.0   \n",
      "...                                                        ...     ...   ...   \n",
      "887                                      Montvila, Rev. Juozas    male  27.0   \n",
      "888                               Graham, Miss. Margaret Edith  female  19.0   \n",
      "889                   Johnston, Miss. Catherine Helen \"Carrie\"  female   NaN   \n",
      "890                                      Behr, Mr. Karl Howell    male  26.0   \n",
      "891                                        Dooley, Mr. Patrick    male  32.0   \n",
      "\n",
      "             SibSp  Parch            Ticket     Fare Cabin Embarked  \n",
      "PassengerId                                                          \n",
      "1                1      0         A/5 21171   7.2500   NaN        S  \n",
      "2                1      0          PC 17599  71.2833   C85        C  \n",
      "3                0      0  STON/O2. 3101282   7.9250   NaN        S  \n",
      "4                1      0            113803  53.1000  C123        S  \n",
      "5                0      0            373450   8.0500   NaN        S  \n",
      "...            ...    ...               ...      ...   ...      ...  \n",
      "887              0      0            211536  13.0000   NaN        S  \n",
      "888              0      0            112053  30.0000   B42        S  \n",
      "889              1      2        W./C. 6607  23.4500   NaN        S  \n",
      "890              0      0            111369  30.0000  C148        C  \n",
      "891              0      0            370376   7.7500   NaN        Q  \n",
      "\n",
      "[891 rows x 11 columns]\n",
      "         Survived      Pclass         Age       SibSp       Parch        Fare\n",
      "count  891.000000  891.000000  714.000000  891.000000  891.000000  891.000000\n",
      "mean     0.383838    2.308642   29.699118    0.523008    0.381594   32.204208\n",
      "std      0.486592    0.836071   14.526497    1.102743    0.806057   49.693429\n",
      "min      0.000000    1.000000    0.420000    0.000000    0.000000    0.000000\n",
      "25%      0.000000    2.000000   20.125000    0.000000    0.000000    7.910400\n",
      "50%      0.000000    3.000000   28.000000    0.000000    0.000000   14.454200\n",
      "75%      1.000000    3.000000   38.000000    1.000000    0.000000   31.000000\n",
      "max      1.000000    3.000000   80.000000    8.000000    6.000000  512.329200\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "titanic = pd.read_csv('train.csv',index_col='PassengerId')\n",
    "print(titanic)\n",
    "#describe()只能统计数据中的数值量\n",
    "print(titanic.describe())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 二、对数据进行预处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "         Survived      Pclass         Age       SibSp       Parch        Fare\n",
      "count  891.000000  891.000000  891.000000  891.000000  891.000000  891.000000\n",
      "mean     0.383838    2.308642   29.361582    0.523008    0.381594   32.204208\n",
      "std      0.486592    0.836071   13.019697    1.102743    0.806057   49.693429\n",
      "min      0.000000    1.000000    0.420000    0.000000    0.000000    0.000000\n",
      "25%      0.000000    2.000000   22.000000    0.000000    0.000000    7.910400\n",
      "50%      0.000000    3.000000   28.000000    0.000000    0.000000   14.454200\n",
      "75%      1.000000    3.000000   35.000000    1.000000    0.000000   31.000000\n",
      "max      1.000000    3.000000   80.000000    8.000000    6.000000  512.329200\n"
     ]
    }
   ],
   "source": [
    "#由describe()可知，Age的count只有714个是有缺失值的，因此用Age的中位数对此进行填充\n",
    "titanic['Age'] = titanic['Age'].fillna(titanic['Age'].median())\n",
    "print(titanic.describe())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['male' 'female']\n"
     ]
    }
   ],
   "source": [
    "#对数据中我们认为重要的非数值型的量，进行数值表示，方便机器进行学习。例如：Sex\n",
    "#先查看Sex中有几类数据\n",
    "print(titanic['Sex'].unique())\n",
    "# loc[行标签，列标签] 按照行列标签进行切片，\n",
    "titanic.loc[titanic['Sex']== 'male' ,'Sex'] = 0\n",
    "titanic.loc[titanic['Sex']== 'female' ,'Sex'] = 1\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['S' 'C' 'Q' nan]\n"
     ]
    }
   ],
   "source": [
    "#将上船地点进行数值化表示\n",
    "print(titanic['Embarked'].unique())\n",
    "\n",
    "titanic['Embarked'] = titanic['Embarked'].fillna('S')\n",
    "#数值化表示\n",
    "titanic.loc[titanic['Embarked']=='S','Embarked'] = 0\n",
    "titanic.loc[titanic['Embarked']=='C','Embarked'] = 1\n",
    "titanic.loc[titanic['Embarked']=='Q','Embarked'] = 3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "处理后的数据：\n",
      "              Survived  Pclass  \\\n",
      "PassengerId                     \n",
      "1                   0       3   \n",
      "2                   1       1   \n",
      "3                   1       3   \n",
      "4                   1       1   \n",
      "5                   0       3   \n",
      "\n",
      "                                                          Name  Sex   Age  \\\n",
      "PassengerId                                                                 \n",
      "1                                      Braund, Mr. Owen Harris    0  22.0   \n",
      "2            Cumings, Mrs. John Bradley (Florence Briggs Th...    1  38.0   \n",
      "3                                       Heikkinen, Miss. Laina    1  26.0   \n",
      "4                 Futrelle, Mrs. Jacques Heath (Lily May Peel)    1  35.0   \n",
      "5                                     Allen, Mr. William Henry    0  35.0   \n",
      "\n",
      "             SibSp  Parch            Ticket     Fare Cabin  Embarked  \n",
      "PassengerId                                                           \n",
      "1                1      0         A/5 21171   7.2500   NaN         0  \n",
      "2                1      0          PC 17599  71.2833   C85         1  \n",
      "3                0      0  STON/O2. 3101282   7.9250   NaN         0  \n",
      "4                1      0            113803  53.1000  C123         0  \n",
      "5                0      0            373450   8.0500   NaN         0  \n"
     ]
    }
   ],
   "source": [
    "print('处理后的数据：\\n',titanic.head())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 三、对数据进行分类"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "1、利用简单的XGboost机器学习算法，进行分类。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 134,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "X_train_shape: (596, 7)\n",
      "X_test_shape: (295, 7)\n",
      "Y_train_shape: (596,)\n",
      "Y_test_shape: (295,)\n",
      "Y_pred:\n",
      " [0 0 0 0 1 1 0 0 0 1 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 1 0 0 0 0 0\n",
      " 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 1 1 0 1 0 1 0 0 1 1 1 0 0 0 0\n",
      " 0 0 1 0 0 0 1 0 0 1 1 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 1 1 0 0 0 0 0 1 0 0 0\n",
      " 0 1 1 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 1 0 0 1 1 0 0 0 0 0 0 1 0 1 0 0 0 0 0\n",
      " 1 0 0 0 1 1 0 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 1 1 0 0 1 1 0 1 0 0 0\n",
      " 0 1 0 0 0 1 0 1 0 1 0 1 0 1 1 0 1 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1 0\n",
      " 1 0 1 1 1 0 1 0 1 0 0 1 1 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0 0 1 0 0 0 1 1 0 0\n",
      " 0 1 0 1 0 1 0 0 0 0 0 0 0 0 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1]\n",
      "Y_test:\n",
      " [0 0 0 0 1 0 0 0 0 1 1 0 1 1 0 0 0 0 0 0 1 0 1 1 0 1 1 1 0 1 1 1 0 0 1 0 1\n",
      " 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 1 0 1 0 1 0 0 0 1 0 0 1 1 0 0 0\n",
      " 0 0 1 0 1 0 0 1 0 1 1 0 0 1 0 0 0 1 1 0 0 1 0 0 0 0 1 1 0 0 0 1 0 1 0 0 0\n",
      " 0 1 1 0 1 0 1 1 0 1 0 0 1 0 0 1 0 1 1 1 0 1 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0\n",
      " 0 0 0 1 1 1 0 1 0 0 1 0 1 0 0 1 1 0 1 0 0 0 0 1 1 1 1 1 0 0 1 0 0 1 0 0 1\n",
      " 0 1 0 0 0 1 1 1 0 0 0 1 1 1 1 0 1 1 0 1 0 0 1 1 0 0 1 0 0 1 0 0 0 0 0 1 0\n",
      " 1 1 1 1 1 0 1 1 0 1 0 1 1 0 0 0 0 0 1 0 1 0 1 1 1 0 1 0 1 1 1 0 0 1 1 0 0\n",
      " 0 1 0 1 1 1 0 0 0 1 0 0 0 0 1 0 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 1]\n",
      "Accuracy:78.98%\n"
     ]
    }
   ],
   "source": [
    "from sklearn.externals import joblib\n",
    "from xgboost import XGBClassifier\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.metrics import accuracy_score\n",
    "\n",
    "#我们所需要的特征量features\n",
    "features = ['Pclass','Age','Sex','SibSp','Parch','Fare','Embarked']\n",
    "\n",
    "X = titanic.loc[:,features]  #iloc[]切片按行列标签\n",
    "Y = titanic.loc[:,'Survived']\n",
    "#print('数据特征X：\\n',X)\n",
    "#print('数据的标签Y：\\n',Y)\n",
    "\n",
    "#将X 和Y split为训练集train 和测试集test\n",
    "#split X and Y into train and test\n",
    "seed = 7\n",
    "test_size = 0.33 #将数据集中33%的数据用来测试\n",
    "X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=test_size,random_state=seed)\n",
    "print('X_train_shape:',X_train.shape)\n",
    "print('X_test_shape:',X_test.shape)\n",
    "print('Y_train_shape:',Y_train.shape)\n",
    "print('Y_test_shape:',Y_test.shape)\n",
    "#训练XGBClassifier分类模型\n",
    "model = XGBClassifier(learning_rate=0.001,n_estimators=2500,\n",
    "                                max_depth=4, min_child_weight=0,\n",
    "                                gamma=0, subsample=0.7,\n",
    "                                colsample_bytree=0.7,\n",
    "                                scale_pos_weight=1, seed=27,\n",
    "                                reg_alpha=0.00006).fit(X_train,Y_train)\n",
    "\n",
    "#保存训练的模型\n",
    "joblib.dump(model,'titanic_XGboost_model')\n",
    "\n",
    "#加载模型\n",
    "load_model = joblib.load('titanic_XGboost_model')\n",
    "\n",
    "#用测试集对分类的结果进行预测，看分类模型的准确率\n",
    "Y_pred = load_model.predict(X_test)\n",
    "print('Y_pred:\\n',Y_pred)\n",
    "print('Y_test:\\n',Y_test.values)\n",
    "predictions = [round(value) for value in Y_pred]\n",
    "#accuracy准确率为测试集标签的预测与实际测试集标签之间的正确率，也可以用来看我数据的分类是否合理\n",
    "accuracy = accuracy_score(Y_test,predictions)\n",
    "print('Accuracy:%.2f%%'%(accuracy*100))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "2、利用线性回归LinearRegression,进行分类"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 135,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "数据的原标签（1表示获救）：\n",
      " [0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0]\n",
      "预测的标签（1表示获救）：\n",
      " [1. 1. 0. 1. 0. 0. 0. 1. 0. 0. 0. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.\n",
      " 1. 0. 1. 0. 1. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 1. 0. 1. 0. 1. 0. 1.\n",
      " 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 1. 0. 0. 1. 1. 1. 0. 1.\n",
      " 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 0. 1. 0. 1. 1. 0.\n",
      " 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 1. 0. 1. 1. 0. 0. 1. 0. 1. 0. 1.\n",
      " 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 1. 0. 1. 1. 0. 0. 1. 1. 1. 0. 1. 0. 0. 1.\n",
      " 0. 0. 0. 1. 0. 0. 1. 1. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 1. 0. 1. 1. 0.\n",
      " 0. 0. 0. 1. 0. 1. 0. 0. 0. 1. 1. 0. 0. 1. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0.\n",
      " 0. 0. 0. 1. 0. 1. 1. 0. 0. 0. 0. 0. 1. 0. 1. 1. 0. 1. 0. 1. 1. 1. 0. 0.\n",
      " 1. 0. 0. 0. 1. 1. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 1. 0. 0. 0. 1.\n",
      " 1. 1. 1. 1. 0. 0. 1. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.\n",
      " 0. 1. 1. 1. 1. 1. 0. 1. 1. 0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 0. 0. 1. 0. 1.\n",
      " 0. 1. 1. 1. 0. 1. 0. 0. 0. 1. 0. 0. 0. 1. 1. 1. 0. 1. 1. 0. 1. 0. 0. 0.\n",
      " 0. 1. 0. 0. 0. 1. 0. 1. 0. 1. 0. 0. 0. 1. 1. 0. 0. 0. 1. 0. 0. 1. 0. 0.\n",
      " 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 1.\n",
      " 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0.\n",
      " 0. 0. 1. 0. 1. 1. 0. 1. 0. 0. 0. 1. 1. 0. 1. 1. 1. 0. 0. 0. 0. 0. 1. 1.\n",
      " 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0. 1. 1. 0. 0. 0. 1.\n",
      " 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 1. 1. 1. 0. 0. 1. 0. 1. 1. 1. 0. 1.\n",
      " 1. 1. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 1. 1. 0. 1. 0. 1. 1.\n",
      " 0. 0. 1. 1. 0. 0. 0. 1. 1. 1. 1. 0. 1. 0. 0. 1. 0. 0. 1. 0. 1. 1. 1. 0.\n",
      " 1. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 1. 1. 0. 1. 1. 1. 0. 0.\n",
      " 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.\n",
      " 0. 0. 0. 1. 0. 1. 1. 0. 1. 0. 1. 1. 1. 0. 1. 0. 1. 1. 0. 0. 0. 0. 1. 1.\n",
      " 0. 0. 0. 0. 1. 1. 0. 0. 1. 0. 1. 0. 0. 1. 1. 1. 1. 0. 1. 0. 0. 1. 0. 1.\n",
      " 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 1. 0. 0. 0. 0. 1. 1. 0. 1. 0. 0. 0. 0. 0.\n",
      " 0. 1. 1. 1. 1. 0. 1. 0. 0. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0.\n",
      " 0. 0. 0. 0. 1. 0. 0. 1. 0. 1. 1. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n",
      " 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 1. 1. 0. 0. 0. 1. 0. 0. 0. 1. 1. 0. 1. 1.\n",
      " 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.\n",
      " 1. 0. 1. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0.\n",
      " 0. 1. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1.\n",
      " 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 1.\n",
      " 0. 0. 0. 1. 0. 0. 1. 1. 1. 0. 0. 1. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1.\n",
      " 0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 1. 1. 1. 0. 0. 0. 1. 0. 1. 1. 0. 1. 1. 1.\n",
      " 1. 1. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 1. 1. 0. 0. 1.\n",
      " 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 1. 1.\n",
      " 0. 0. 1.]\n",
      "模型在训练集上的分类的准确度：78.98%\n"
     ]
    }
   ],
   "source": [
    "from sklearn.linear_model import LinearRegression\n",
    "from sklearn.model_selection import KFold\n",
    "import numpy as np\n",
    "\n",
    "#我们所需要的特征量features\n",
    "features = ['Pclass','Age','Sex','SibSp','Parch','Fare','Embarked']\n",
    "# print(titanic[features])\n",
    "\n",
    "model = LinearRegression()\n",
    "kf = KFold(n_splits = 10,random_state = 1,shuffle=True)\n",
    "\n",
    "predictions = []\n",
    "\n",
    "for train,test in kf.split(titanic):\n",
    "    train_features = titanic[features].iloc[train,:]\n",
    "    train_label = titanic['Survived'].iloc[train]\n",
    "    model.fit(train_features,train_label)\n",
    "    test_predictions = model.predict(titanic[features].iloc[test,:])\n",
    "    predictions.append(test_predictions)\n",
    "# print(predictions)\n",
    "predictions = np.concatenate(predictions,axis=0)\n",
    "# print(predictions)\n",
    "#predictions中存放的是test样本中预测的概率\n",
    "predictions[predictions>0.5]=1\n",
    "predictions[predictions<=0.5]=0\n",
    "print('数据的原标签（1表示获救）：\\n',titanic['Survived'].to_list())\n",
    "print('预测的标签（1表示获救）：\\n',predictions)\n",
    "accurary = sum(predictions[predictions==titanic['Survived']])/len(predictions)\n",
    "print('模型在训练集上的分类的准确度：%.2f%%'%(accuracy*100))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "3、利用随机森林进行分类"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 136,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "模型在训练集上的分类准确率：79.01%\n"
     ]
    }
   ],
   "source": [
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.model_selection import KFold\n",
    "from sklearn.model_selection import cross_val_score\n",
    "\n",
    "features = ['Pclass','Age','Sex','SibSp','Parch','Fare','Embarked']\n",
    "model = RandomForestClassifier(random_state=1,n_estimators=10,min_samples_split=2,min_samples_leaf=1)\n",
    "# n_estimators=10 随机森林里要构建树的个数  min_samples_split=2 数据最小切分个数  min_samples_leaf=1 叶子结点的最小个数\n",
    "kf = KFold(n_splits=3,random_state=1,shuffle=False)\n",
    "scores = cross_val_score(model,titanic[features],titanic['Survived'],cv=kf)\n",
    "print('模型在训练集上的分类准确率：%.2f%%'%(scores.mean()*100))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "模型在训练集上的分类准确率：82.38%\n"
     ]
    }
   ],
   "source": [
    "#对随机森林的参数 n_estimators,min_samples_split,min_samples_leaf 简单调节\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.model_selection import KFold\n",
    "from sklearn.model_selection import cross_val_score\n",
    "\n",
    "features = ['Pclass','Age','Sex','SibSp','Parch','Fare','Embarked']\n",
    "model = RandomForestClassifier(random_state=1,n_estimators=50,min_samples_split=4,min_samples_leaf=2)\n",
    "# n_estimators=10 随机森林里要构建树的个数  min_samples_split=2 数据最小切分个数  min_samples_leaf=1 叶子结点的最小个数\n",
    "kf = KFold(n_splits=3,random_state=1,shuffle=False)\n",
    "scores = cross_val_score(model,titanic[features],titanic['Survived'],cv=kf)\n",
    "print('模型在训练集上的分类准确率：%.2f%%'%(scores.mean()*100))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "4、K近邻算法（既可以用于多分类，也可以用于回归）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Y_pred:\n",
      " [0 0 0 0 1 1 0 0 0 1 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1 1 1 0 0 0 0 0\n",
      " 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 0 1 0 1 0 0 1 1 1 0 0 0 0\n",
      " 0 0 1 0 1 0 1 0 0 1 1 0 0 0 0 0 0 0 1 1 0 1 1 0 0 0 1 1 0 0 0 0 0 1 0 0 0\n",
      " 0 1 1 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 1 0 0 1 1 0 0 1 0 1 0 0 0 1 0 0 0 0 0\n",
      " 0 0 0 0 1 1 0 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 1 1 0 0 1 1 0 1 0 0 0\n",
      " 0 1 0 0 0 1 0 1 0 1 0 1 0 1 1 0 1 1 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1 0\n",
      " 1 0 1 1 1 0 1 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 1 0 0 0 1 1 0 0\n",
      " 0 1 0 1 0 1 0 0 0 0 0 0 0 0 1 0 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 1 1 0 1]\n",
      "Y_test:\n",
      " [0 0 0 0 1 0 0 0 0 1 1 0 1 1 0 0 0 0 0 0 1 0 1 1 0 1 1 1 0 1 1 1 0 0 1 0 1\n",
      " 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 1 0 1 0 1 0 0 0 1 0 0 1 1 0 0 0\n",
      " 0 0 1 0 1 0 0 1 0 1 1 0 0 1 0 0 0 1 1 0 0 1 0 0 0 0 1 1 0 0 0 1 0 1 0 0 0\n",
      " 0 1 1 0 1 0 1 1 0 1 0 0 1 0 0 1 0 1 1 1 0 1 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0\n",
      " 0 0 0 1 1 1 0 1 0 0 1 0 1 0 0 1 1 0 1 0 0 0 0 1 1 1 1 1 0 0 1 0 0 1 0 0 1\n",
      " 0 1 0 0 0 1 1 1 0 0 0 1 1 1 1 0 1 1 0 1 0 0 1 1 0 0 1 0 0 1 0 0 0 0 0 1 0\n",
      " 1 1 1 1 1 0 1 1 0 1 0 1 1 0 0 0 0 0 1 0 1 0 1 1 1 0 1 0 1 1 1 0 0 1 1 0 0\n",
      " 0 1 0 1 1 1 0 0 0 1 0 0 0 0 1 0 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 1]\n",
      "Accuracy:80.00%\n"
     ]
    }
   ],
   "source": [
    "from sklearn.neighbors import KNeighborsClassifier as KNN\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.metrics import accuracy_score\n",
    "\n",
    "features = ['Pclass','Age','Sex','SibSp','Parch','Fare','Embarked']\n",
    "X = titanic.loc[:,features]  #iloc[]切片按行列标签\n",
    "Y = titanic.loc[:,'Survived']\n",
    "#print('数据特征X：\\n',X)\n",
    "#print('数据的标签Y：\\n',Y)\n",
    "\n",
    "seed = 7\n",
    "test_size = 0.33 #将数据集中33%的数据用来测试\n",
    "X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=test_size,random_state=seed)\n",
    "model = KNN().fit(X_train,Y_train)\n",
    "y_pred = model.predict(X_test)\n",
    "\n",
    "print('Y_pred:\\n',Y_pred)\n",
    "print('Y_test:\\n',Y_test.values)\n",
    "predictions = [round(value) for value in Y_pred]\n",
    "#accuracy准确率为测试集标签的预测与实际测试集标签之间的正确率，也可以用来看我数据的分类是否合理\n",
    "accuracy = accuracy_score(Y_test,predictions)\n",
    "print('Accuracy:%.2f%%'%(accuracy*100))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 四、当参数优化后，准确率变化不大，遇到瓶颈后，我们要回归数据集中，从中在找到有用的特征。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "通过上面可以发现准确率有小幅提升，但是似乎得到的结果还是不太理想。我们可以发现模型似乎优化的差不多了，使用的特征似乎也已经使用完了。准确率已经达到了瓶颈，但是如果我们还想提高精度的话，还是要回到最原始的数据集里面。对分类器的结果最大的影响还是输入的数据本身。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "1、新增特征，家庭成员数和名字长度"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "#可能家庭成员的数量对是否获救有关，则对该特征进行提取\n",
    "titanic['FamilySize'] = titanic['SibSp']+titanic['Parch']\n",
    "#可能名字长度对是否获救也有某种潜在的关联\n",
    "titanic['NameLength']=titanic['Name'].apply(lambda x:len(x))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "提取名字(名字里面包含称呼，如小姐，女士，先生等等)，这些称呼也是有可能对结果产生影响的"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "PassengerId\n",
      "1        Mr\n",
      "2       Mrs\n",
      "3      Miss\n",
      "4       Mrs\n",
      "5        Mr\n",
      "       ... \n",
      "887     Rev\n",
      "888    Miss\n",
      "889    Miss\n",
      "890      Mr\n",
      "891      Mr\n",
      "Name: Name, Length: 891, dtype: object\n",
      "Mr          517\n",
      "Miss        182\n",
      "Mrs         125\n",
      "Master       40\n",
      "Dr            7\n",
      "Rev           6\n",
      "Col           2\n",
      "Major         2\n",
      "Mlle          2\n",
      "Capt          1\n",
      "Mme           1\n",
      "Countess      1\n",
      "Jonkheer      1\n",
      "Don           1\n",
      "Lady          1\n",
      "Ms            1\n",
      "Sir           1\n",
      "Name: Name, dtype: int64\n",
      "1     517\n",
      "2     183\n",
      "3     125\n",
      "4      40\n",
      "5       7\n",
      "6       6\n",
      "7       5\n",
      "10      3\n",
      "8       3\n",
      "9       2\n",
      "Name: Name, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "#使用正则表达式来对名字进行提取\n",
    "import re\n",
    "\n",
    "def get_title(name):\n",
    "    #名字总是由大小写字母组成，并以点号（.）结束\n",
    "    #在name中寻找title\n",
    "    title_search = re.search('([A-Za-z]+)\\.',name)\n",
    "    #如果存在\n",
    "    if title_search:\n",
    "        # 数据集的名字如Todoroff, Mr. Lalio 我们要找出其中的Mr 则返回group(1)\n",
    "        return title_search.group(1)\n",
    "    return ''\n",
    "\n",
    "#获取所有title（如Mr，Miss等） 并对其进行统计个数\n",
    "titles = titanic['Name'].apply(get_title)\n",
    "print(titles)\n",
    "print(pd.value_counts(titles))\n",
    "\n",
    "#将字符标签（Mr，Mrs,Miss等）数值化\n",
    "\n",
    "titles_mapping = { \n",
    "    \"Mr\": 1,\n",
    "    \"Miss\": 2,\n",
    "    \"Mrs\": 3,\n",
    "    \"Master\": 4,\n",
    "    \"Dr\": 5,\n",
    "    \"Rev\": 6,\n",
    "    \"Major\": 7,\n",
    "    \"Col\": 7,\n",
    "    \"Mlle\": 8,\n",
    "    \"Mme\": 8,\n",
    "    \"Don\": 9,\n",
    "    \"Lady\": 10,\n",
    "    \"Countess\": 10,\n",
    "    \"Jonkheer\": 10,\n",
    "    \"Sir\": 9,\n",
    "    \"Capt\": 7,\n",
    "    \"Ms\": 2\n",
    "}\n",
    "\n",
    "for k,v in titles_mapping.items():\n",
    "    titles[k==titles]=v\n",
    "print(pd.value_counts(titles))\n",
    "\n",
    "#给原数据添加title特征"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "2、新增titles特征"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "新增特征后的train数据集：\n",
      "              Survived  Pclass  \\\n",
      "PassengerId                     \n",
      "1                   0       3   \n",
      "2                   1       1   \n",
      "3                   1       3   \n",
      "4                   1       1   \n",
      "5                   0       3   \n",
      "\n",
      "                                                          Name  Sex   Age  \\\n",
      "PassengerId                                                                 \n",
      "1                                      Braund, Mr. Owen Harris    0  22.0   \n",
      "2            Cumings, Mrs. John Bradley (Florence Briggs Th...    1  38.0   \n",
      "3                                       Heikkinen, Miss. Laina    1  26.0   \n",
      "4                 Futrelle, Mrs. Jacques Heath (Lily May Peel)    1  35.0   \n",
      "5                                     Allen, Mr. William Henry    0  35.0   \n",
      "\n",
      "             SibSp  Parch            Ticket     Fare Cabin  Embarked  \\\n",
      "PassengerId                                                            \n",
      "1                1      0         A/5 21171   7.2500   NaN         0   \n",
      "2                1      0          PC 17599  71.2833   C85         1   \n",
      "3                0      0  STON/O2. 3101282   7.9250   NaN         0   \n",
      "4                1      0            113803  53.1000  C123         0   \n",
      "5                0      0            373450   8.0500   NaN         0   \n",
      "\n",
      "             FamilySize  NameLength titles  \n",
      "PassengerId                                 \n",
      "1                     1          23      1  \n",
      "2                     1          51      3  \n",
      "3                     0          22      2  \n",
      "4                     1          44      3  \n",
      "5                     0          24      1  \n"
     ]
    }
   ],
   "source": [
    "titanic['titles']=titles\n",
    "print('新增特征后的train数据集：\\n',titanic.head())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "3、对特征的重要性进行选择，特征虽然提取了很多，但并不是每一个都是重要的。，而随机森林的好处就是特征重要性衡量。\n",
    "\n",
    "特征重要性解释：在机器学习的训练过程中，对于多个特征来说，假如要对其中某一个特征来衡量它的重要性，我们就不用这个特征的数据来进行训练，而是把这个特征里面的数据全部替换为噪音数据，假如得到的准确率没有太大的变化，那就说明这个特征其实不那么重要，如果得到的准确率相差太大的话，说明这个特征很重要。其他特征的重要衡量以此类推。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "提取特征后的数据：\n",
      "              Pclass   Age  Sex  SibSp  Parch     Fare  Embarked  FamilySize  \\\n",
      "PassengerId                                                                   \n",
      "1                 3  22.0    0      1      0   7.2500         0           1   \n",
      "2                 1  38.0    1      1      0  71.2833         1           1   \n",
      "3                 3  26.0    1      0      0   7.9250         0           0   \n",
      "4                 1  35.0    1      1      0  53.1000         0           1   \n",
      "5                 3  35.0    0      0      0   8.0500         0           0   \n",
      "\n",
      "             NameLength titles  \n",
      "PassengerId                     \n",
      "1                    23      1  \n",
      "2                    51      3  \n",
      "3                    22      2  \n",
      "4                    44      3  \n",
      "5                    24      1  \n",
      "每个特征的得分列表：\n",
      " [24.59567142  1.27768955 68.85199425  0.5342545   1.82976043 14.21323514\n",
      "  1.72647127  0.20768458 23.69319016 26.98338607]\n"
     ]
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXAAAAEuCAYAAACXnUm4AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAd1ElEQVR4nO3deZhdVZ3u8e9LIqIgylDQEdSARBQHENIM4uMAaqMooAxCo+baeKO3HVBsvaD9aDuj7Ty0bRQx7QiINKitglGcBQpkHi7zIEhKFGVwAt77x9qHnFQqqZOk9j61Ku/neeo5Z+9zTn6rUlXv2WfttdaWbSIioj7rDbsBERGxZhLgERGVSoBHRFQqAR4RUakEeEREpWZ3WWzzzTf33LlzuywZEVG9c88997e2R8bv7zTA586dy+joaJclIyKqJ+n6ifanCyUiolIJ8IiISk0a4JK2l3R+39cfJb1e0qaSzpB0ZXO7SRcNjoiIYtIAt32F7Z1s7wTsAtwNnAIcDSyxPQ9Y0mxHRERHVrcLZW/gatvXA/sDi5v9i4EDprJhERGxaqsb4IcCX23ub2n7FoDmdouJXiBpoaRRSaNjY2Nr3tKIiFjOwAEuaX1gP+Ck1Slge5Ht+bbnj4ysMIwxIiLW0OocgT8XOM/2rc32rZLmADS3S6e6cRERsXKrE+CHsaz7BOA0YEFzfwFw6lQ1KiIiJjfQTExJDwaeDbyyb/exwImSjgBuAA6e+ubF3KO/3XqN647dt/UaETH1Bgpw23cDm43bdxtlVEpERAxBZmJGRFQqAR4RUakEeEREpRLgERGVSoBHRFQqAR4RUakEeEREpRLgERGVSoBHRFQqAR4RUakEeEREpRLgERGVSoBHRFQqAR4RUakEeEREpRLgERGVSoBHRFQqAR4RUakEeEREpRLgERGVSoBHRFRqoACX9DBJX5d0uaTLJO0haVNJZ0i6srndpO3GRkTEMoMegX8M+K7txwI7ApcBRwNLbM8DljTbERHRkUkDXNLGwNOA4wBs/9X27cD+wOLmaYuBA9pqZERErGiQI/BtgTHgeEm/kvQ5SRsCW9q+BaC53WKiF0taKGlU0ujY2NiUNTwiYl03SIDPBnYGPm37ycBdrEZ3ie1Ftufbnj8yMrKGzYyIiPEGCfCbgJtsn9Vsf50S6LdKmgPQ3C5tp4kRETGRSQPc9m+AGyVt3+zaG7gUOA1Y0OxbAJzaSgsjImJCswd83muBL0taH7gGeDkl/E+UdARwA3BwO02MiIiJDBTgts8H5k/w0N5T25yIiBhUZmJGRFQqAR4RUakEeEREpRLgERGVSoBHRFQqAR4RUakEeEREpRLgERGVSoBHRFQqAR4RUakEeEREpRLgERGVSoBHRFQqAR4RUakEeEREpRLgERGVSoBHRFQqAR4RUakEeEREpRLgERGVSoBHRFRqoKvSS7oOuAO4F7jH9nxJmwInAHOB64BDbP++nWZGRMR4q3ME/kzbO9me32wfDSyxPQ9Y0mxHRERH1qYLZX9gcXN/MXDA2jcnIiIGNWiAGzhd0rmSFjb7trR9C0Bzu8VEL5S0UNKopNGxsbG1b3FERAAD9oEDe9q+WdIWwBmSLh+0gO1FwCKA+fPnew3aGBERExjoCNz2zc3tUuAUYFfgVklzAJrbpW01MiIiVjRpgEvaUNJDeveB5wAXA6cBC5qnLQBObauRERGxokG6ULYETpHUe/5XbH9X0jnAiZKOAG4ADm6vmRERMd6kAW77GmDHCfbfBuzdRqMiImJymYkZEVGpBHhERKUS4BERlUqAR0RUKgEeEVGpBHhERKUS4BERlUqAR0RUKgEeEVGpBHhERKUS4BERlUqAR0RUKgEeEVGpBHhERKUS4BERlUqAR0RUKgEeEVGpBHhERKUS4BERlUqAR0RUKgEeEVGpgQNc0ixJv5L0rWZ7G0lnSbpS0gmS1m+vmRERMd7qHIEfCVzWt/1+4CO25wG/B46YyoZFRMSqDRTgkrYG9gU+12wL2Av4evOUxcABbTQwIiImNugR+EeBNwP3NdubAbfbvqfZvgnYaqIXSlooaVTS6NjY2Fo1NiIilpk0wCU9H1hq+9z+3RM81RO93vYi2/Ntzx8ZGVnDZkZExHizB3jOnsB+kp4HbABsTDkif5ik2c1R+NbAze01MyIixpv0CNz2Mba3tj0XOBT4ge3DgR8CBzVPWwCc2lorIyJiBWszDvz/AkdJuorSJ37c1DQpIiIGMUgXyv1snwmc2dy/Bth16psUERGDyEzMiIhKJcAjIiqVAI+IqFQCPCKiUgnwiIhKJcAjIiqVAI+IqFQCPCKiUgnwiIhKJcAjIiqVAI+IqFQCPCKiUgnwiIhKJcAjIiqVAI+IqFQCPCKiUgnwiIhKJcAjIiqVAI+IqFQCPCKiUgnwiIhKTRrgkjaQdLakCyRdIukdzf5tJJ0l6UpJJ0hav/3mRkREzyBH4H8B9rK9I7ATsI+k3YH3Ax+xPQ/4PXBEe82MiIjxJg1wF3c2mw9ovgzsBXy92b8YOKCVFkZExIQG6gOXNEvS+cBS4AzgauB22/c0T7kJ2KqdJkZExEQGCnDb99reCdga2BV43ERPm+i1khZKGpU0OjY2tuYtjYiI5azWKBTbtwNnArsDD5M0u3loa+Dmlbxmke35tuePjIysTVsjIqLPIKNQRiQ9rLn/IOBZwGXAD4GDmqctAE5tq5EREbGi2ZM/hTnAYkmzKIF/ou1vSboU+JqkdwO/Ao5rsZ0RETHOpAFu+0LgyRPsv4bSHx4REUOQmZgREZVKgEdEVCoBHhFRqQR4RESlBhmFEhExo809+tut/vvXHbtvK/9ujsAjIiqVAI+IqFQCPCKiUgnwiIhKJcAjIiqVAI+IqFQ1wwjbHuYD7Q31iYhoQ47AIyIqlQCPiKhUNV0oETGz1TobcphyBB4RUakEeEREpRLgERGVSoBHRFQqAR4RUakEeEREpSYNcEmPkPRDSZdJukTSkc3+TSWdIenK5naT9psbERE9gxyB3wO80fbjgN2BV0vaATgaWGJ7HrCk2Y6IiI5MGuC2b7F9XnP/DuAyYCtgf2Bx87TFwAFtNTIiIla0Wn3gkuYCTwbOAra0fQuUkAe2mOrGRUTEyg0c4JI2Ak4GXm/7j6vxuoWSRiWNjo2NrUkbIyJiAgMFuKQHUML7y7a/0ey+VdKc5vE5wNKJXmt7ke35tuePjIxMRZsjIoLBRqEIOA64zPaH+x46DVjQ3F8AnDr1zYuIiJUZZDXCPYGXAhdJOr/Z9xbgWOBESUcANwAHt9PEiIiYyKQBbvungFby8N5T25yIiBhUZmJGRFQqAR4RUakEeEREpRLgERGVSoBHRFQqAR4RUakEeEREpRLgERGVSoBHRFQqAR4RUakEeEREpRLgERGVGmQ1woh1ytyjv916jeuO3bf1GjHz5Qg8IqJSCfCIiEolwCMiKpUAj4ioVAI8IqJSCfCIiEolwCMiKpUAj4ioVAI8IqJSkwa4pM9LWirp4r59m0o6Q9KVze0m7TYzIiLGG+QI/AvAPuP2HQ0ssT0PWNJsR0REhyYNcNs/Bn43bvf+wOLm/mLggCluV0RETGJN+8C3tH0LQHO7xcqeKGmhpFFJo2NjY2tYLiIixmv9JKbtRbbn254/MjLSdrmIiHXGmgb4rZLmADS3S6euSRERMYg1DfDTgAXN/QXAqVPTnIiIGNQgwwi/CvwC2F7STZKOAI4Fni3pSuDZzXZERHRo0ivy2D5sJQ/tPcVtiYiI1ZCZmBERlUqAR0RUKgEeEVGpBHhERKUS4BERlUqAR0RUKgEeEVGpBHhERKUS4BERlUqAR0RUKgEeEVGpBHhERKUS4BERlUqAR0RUKgEeEVGpBHhERKUS4BERlUqAR0RUatJLqkUMw9yjv916jeuO3bf1GhFtSoBHTCN544rVkQAfQP6oImI6Wqs+cEn7SLpC0lWSjp6qRkVExOTW+Ahc0izgU8CzgZuAcySdZvvSqWpcDFc+eURMb2vThbIrcJXtawAkfQ3YH0iAR1Qob9j1ke01e6F0ELCP7Vc02y8FdrP9mnHPWwgsbDa3B65Y8+auls2B33ZUa7rVT+3UTu2ZVftRtkfG71ybI3BNsG+FdwPbi4BFa1FnjUgatT2/67rToX5qp3Zqz9za/dbmJOZNwCP6trcGbl675kRExKDWJsDPAeZJ2kbS+sChwGlT06yIiJjMGneh2L5H0muA7wGzgM/bvmTKWrb2Ou+2mUb1Uzu1U3vm1r7fGp/EjIiI4cpiVhERlUqAR0RUKgEeEVGpBHhERKVm1GqEkh4N3GT7L5KeATwJ+C/bt3dQe0vgvcDDbT9X0g7AHraP66D2u4B32L6n2d4Y+Jjtl7ddu6n3d5SlFQycY/s3XdTtq78V8Cj6fp9t/7iDugIOB7a1/U5JjwT+zvbZLdb8JhNMmOuxvV9btfva8Bjg08CWtp8g6UnAfrbf3Xbtpv4sYEuW/3nf0HLNDYE/2b6v+f4fC3zH9t/arDuZmXYEfjJwr6TtgOOAbYCvdFT7C5QhlQ9vtv8f8PqOas8GzpL0JEnPoYzRP7eLwpJeAZwNvAg4CPilpH/qonZT//3Az4B/Bd7UfP1LR+X/A9gDOKzZvoOywFubPgh8CLgW+BPw2ebrTuDilmv3fBY4BvgbgO0LKfNAWifptcCtwBnAt5uvb3VQ+sfABs3BwhLg5ZS/+eGyPWO+gPOa2zcBr23u/6qj2ueMrwec3+H3/izKH/TNwHYd1r0C2KxvezPgio7rP7CreuNq937f+n/mF3RU+8eD7Gup9tB+14Gr+n/fhvCzfi3w5vHf/7C+ZtoR+N8kHQYsYNm78gM6qn2XpM1oPt5K2h34QxeFJT0N+BjwTuBM4JOSHr7KF02dmyhHnj13ADd2VBvgGrr7GY/3t+bjfO9nPgLc11HtEUnb9jYkbQOssNhRS37bdFf2vu+DgFs6qn0jHf1djSNJe1C6zHrLNg69C3roDZhiLwdeBbzH9rXNL/WXOqp9FGUpgUdL+hnlj+mgjmp/EDjYzVrskl4E/IDST9e2X1O6b06l/EHvD5wt6SgA2x9uo6ikTzT17gbOl7QE+Evvcduva6PuOB8HTgG2kPQeys/7XzuoC/AG4ExJ1zTbc4FXdlT71ZSZiI+V9GtKd85L2izY+32ivGGfKenbLP/zbuX3rM/rKd1Gp9i+pHnz/GHLNSc1Y2diStoEeIRL/1xXNWdTlswVpRuhkxMckmbZvnfcvs1s39ZB7bev6nHb72ip7oJJ6i5uo+4E7XgssDflZ77E9mVd1G1qP5Blb9KX2/7Lqp7fQv0NgfVs3zHpk9e+1qp+z2z7nW23oWnHhrbv6qLWIGZUgEs6E9iP8snifGAM+JHto1b1uimq/aIJdv8BuMj20pZr90bAbGV7ny5HwIxrxybA7e7wl6oJkT/33sCaLo0H2r675brrARfafkKbdVZR/8GUT32Psv2/Jc0Dtrfd+gk9SfcC/w4c0/tZSzrP9s4d1D7Y9kmT7Wuh7h6UgREb2X6kpB2BV9r+5zbrTmam9YE/1PYfKSMijre9C+XkXheOAD5H6SM7nHKm/ijgZ83FLtr0BcoImDnNdusjYCS9rTn6RNIDJf0AuBq4VVJX/+dQRgQ8qG/7QcD32y5q+z7ggmbo4DAcD/yVMgoGyrmITobxAZdQsuN0SZs2+ya6PkAbjhlw31T7KPAPwG0Ati8AntZB3VWaaX3gsyXNAQ4B3tpx7fuAx9m+Fe4/Kv40sBtlCNIXW6y9ue0TJR0D968Uee9kL1pLLwbe1dxfQPmDHgEeAyymgxBtbGD7zt6G7Tubo9MuzAEukXQ2cP/HancwFht4tO0XNyftsf2nZlx6F+6x/WZJhwA/kfQyVjE2fSpIei7wPGArSR/ve2hj4J42a/fYvnHcf3Hbf2OTmmkB/k7KkehPbZ/TnGi4sqPac3vh3VgKPMb27yS13Rc+jBEwf+3rKvkH4KtNN8ZlzbmArtwlaWfb5wFI2oUynLILrfTvD+ivkh7Esp/5o+k7qdcyATQHDZcAXwXa/iRyMzBK6SLtn+NwB+WEbttulPQUwM31D14HdHa+Y2VmVB/4MEn6D8ovca8v7kDKx9o3Ad+y/cwWa+8MfAJ4AmUyxwhwUJsncCX9EngFZVLFFcAutq9tHrvcdhcjYJA0HziBZVeDmgO82HYnE5mGRdKzKSNedgBOB/YE/pftMzuovUv//28z8/cA2//VQe0HdDU4YFzdzSlDdZ9FeQM7HTiyi4ECq2zXTApwSRtQ+qIfD2zQ22+79ZmBzcfXFwFPbXbdBsyx/eoWa/49cKPt3zRHva+kvHFcCrzN9u9arL0bpatkBPio7Xc1+58HvNT2Yat6/RS1YT1gd8rM097on8s7HP2zO+WN83HA+pQLm9xle+OO6m9G+f4F/NJ2qxfZlbSX7R+s5IQ9tr/RZv2mDRexYnfNHyhH5+8edqB2baYF+EnA5cA/UrpTDgcus31kR/V3amofQhkbe7LtT7ZY7zzgWU03zdOAr1Fmiu1E6Y/vahz60Ej6he09Jn9mK7VHKVPITwLmAy8D5tl+Swe132n7bX3b6wFftH14izXfYfvtko6f4GF3dKD0AUrfc2+JjEMpb2B/AJ5q+wVTXK8332BCHc03WKmZ1ge+ne2DJe1ve7Gkr1D6xFujsrDNoZT1MG6jfJxXm10mfWb1HWW/GFhk+2TgZEnnd1C/dxT4dsonDwM/Bd7Z4ZHQ6ZIOBL7R5fDFHttX9Y3DP17Szzsq/UhJx9h+XzMe/CTgvDYL2n57c9vJImkrsaftPfu2L5L0M9t7SmpjMtFoC//mlJlpAd776Hy7pCcAv6HMUGvT5cBPgBfYvgpAUhcnVQBmSZrtsgrh3sDCvse6+tl+jTLK5sBm+3DKm1hXQwmPAjYE7pH0Z8rRmDvqxri7OaF1fnNkeEvTli68HPhyM/LomZSV8T7SZkFJL6CMfb++2X4b5ed+PaU/+No26zc2krSb7bOaNuwKbNQ8NuWjUXoTwlY2/nyq6602t7jQStdflJNqmwBPp0y5XQq8quWaL6QE1o2Usd97A9d29P2+lbIS36nAr1jWJbYd8LOO2nDuBPtGh/270NH3/ijKuZaNKZ9CPkzLC4kBO/d97UaZsPap3r6Wa18IPLi5/3zKfINdmr+773X0f/73wEWULsrrmjbtSnnjPKTFuucNsq/rrxnVBz5MzYzAAyhdKXtRTvCdYvv0luvuThl5cbqbKb5Nt85GbobWtVz/g5SPmSc2uw4CHu/m43YXmhmg81j+xHVr64FLeqRbXn96FbVXtf6Gbe/VYu0LbO/Y3P88ZbmI9zfbnczE7GvLQykHLK2u9d83/vwQyoFaz8bADrZ3bbP+ZGZEgGvZQjcTcvsL3SynmZ12MGU4W2t/UMMk6Q5Kn7coRz+9SQ2zgDvd3UiMVwBHAltTjkZ3B37RcpDdH1aSTrZ94GSvmeL661EWLzth0idPbd0LgadQFhC7FjjQ9mjz2KW2d+igDQ+kdNvMZfkLOrSyFkozZX4nyqCIt/U9dAfwQ9u/b6PuoGZKH/hDht2Afi4nFj/TfM1ItqfL//mRlI/Vv7T9zGZ6f9sTbPqn42270me1xOWqMK9m+SPCLnyU8ib5R8rorl54P5nulpM9lTLi5Fw6mLjkMmX+Aklf8ZCvvjORGRHgbmnFu1g5SY+1fXkziWgFXXTfNP5s+8+SkPTApk3bt1zTK7nfpTMk/QslxPun8bc29t/25yV9D9gCuKDvod9QTqp2YWvb+3RUC0kn2j4EOE/SCj9r20/qqi0TmREB3iNpMeVs+O3N9ibAh9zB+NR10FGUUS8f6tvX/wveVdfRTZIeBvw3JdR+z7JZmW3ZUdIfKUfiD2ruQ7cjYHq/0/0TxUzLnwhs/1rSx4DPS/qu7ftsd3X0DfBzSU+0fVFH9XpzSC6jzKruEfCBjtqwUjOiD7xH0q9sP3myfbH2muFbN7i5gLHK+twHUkYG/FubR4KraNPTgYcC37X9167rrytUVpt8OeV8w0nAF2xf3lHtSymjrK6ldKH03jRbPRKe6CStpAtzBD611pO0Se/EQnMycaZ9j9PFf9KM9W5mgb6PZbNAF9Hy1YiaZRNeRfljvgg4zvaP2qw53TRzHXZg+dE3ra9HYvv7wPebkSCHUT759IbRfqnlvuLntvhvr0DS/wH+Gdi2OYnb8xDKEN6hmmlH4C8D3kI5KjBl6M97bLe5lOs6adyQsk8BY7b/rdk+3/ZOLdc/gTJx6yeUP+rr3dGSCdOByhVqnkEJ8P+h/B/81B0tn9DMwH0J8FJKl9WXKbNxn2j7GS3XfiplyYLjVa5DupFbmkTUvEltQjlAObrvoTuG8SlzvBkV4AAqV6PZC+6/xNWlQ27SjCTpYmAnl7XHLwcW9sZeS7rYLV+pRtJFtp/Y3J8NnN3lOORhaxZ12pFyZfQdVdaf/5yneC2QldT+BuVSbl+kdJ/c0vfYqO35LdZ+O2Xdme1tP0bl4t0nefnp9euMGdG9MMHH6f90mV4e7fkq8CNJv6Wsv/0TAEnb0c1Vw+//mN68iXRQclr5UzOc8B6V5VyX0t2Qxk/a/sFED7QZ3o0XAk+mWffF9s2SpsuQ1s7NiACnzHrs/zj9OFq+pNi6zvZ7VK4E35sF2vsotx6lL7xtvZEgsPxokC5HggzTaDP65rOUMdF3Ame3WVB9y8hqgiVl3cFysjQXEukN6WtmQK+zZkQXyrr+cTrWbZLmAhu7xQt4NHUmWka2x10M123Gvs8Dnk3pl/4nytWgPr7KF85QMyXAlxvi0/W6DBHD0BwF37+Mr+1ThtykTqhcjeg5lE9b37N9xpCbNDQzJcDvZdlsNFGuTH43687H6VjHqFzCbzvKuQgo68Ff7XavAPUS219a2dpDXa851KNmPfBh1B62GdEHbnvWsNsQ0bGnA0/onXtoZiG3PTux19883U4atn1B5WlrRgR4xDroCkpwXd9sP4KyNnZrbH+muZ1uaw/V342whhLgERWR9E1KYD0UuEzS2c32bkAnl3OTtA1lpNFcll/Sdb8Wa054IWWWdZmukxLgEXX54LAbQFk47Djgm8B9HdVc1QSlb3XUhmlnRpzEjFhXNZN4+o+CW5/eLeks27u1XScmlwCPqJCkhcC7KLNg72PZiKvWZ2NK+kfKWOzT6buoQhdrwDdLBrwXeLjt5zZLZ+xh+7i2a09HCfCICkm6khJcvx1C7fdRFrG6mmVdKO7i8oGSvgMcD7y1WQNmNmU9mCe2XXs6Sh94RJ2upsx1GIYXAtsOac31zW2fKOkYuH8dnHsne9FMlQCPqNMxlKvTnMXy3Riv66D2BcDDKAtode2uZinb3vj33elm8bRpKQEeUafPAD+gTN7paiRIz5bA5ZLOYfk3j9aGEfY5CjgNeLSknwEjtHzxkOksfeARFZL0c9tPGVLtp0+0v6srIjX93ttTTtxeMR2vFt+VBHhEhSS9hzIL85ssfxQ89KvEtEnSLGBfVpxENJR1WIYtAR5RIUkTXUKsq2GEuwOfoKy7vz4wC7iri0XjJP0P8GfGdR1Nw+n9nUgfeESFbG8zxPKfBA6lXHt2PvAyyrjwLmw97CvBTyfrDbsBETE4SW/uu3/wuMfe21U7bF8FzLJ9r+3jKRdY7sJ3JD2no1rTXgI8oi6H9t0/Ztxj+3TUhrslrQ+cL+kDkt7AsqVm2/ZL4BRJf5L0R0l39F1ab52TAI+oi1Zyf6LttryUkh2voVxI5RHAgR3V/hCwB/Bg2xvbfsi6fMGW9IFH1MUruT/R9pSS9EjbN9jurUH+Z6Drk4dXAhc7oy+AjEKJqErf5QP7Lx1Is72B7Qe0WPv+a81KOtl2V0fd/W34ArAt8B2WHz65Tg4jzBF4REWGfPnA/i6a1ocrrsS1zdf6zdc6LQEeEYNaVfdNNw1YR8d7r0y6UCJiIJN037ijiTwjwJuBxwMb9PZ3sZTtdJQj8IgYyJC7b3q+DJwAPB94FbAAGBtqi4YoR+ARUQ1J59reRdKFvRmZkn5ke8IFtma6HIFHRE16Kw/eImlf4GZg6yG2Z6gS4BFRk3dLeijwRsqCWhsDbxhuk4YnXSgREZXKEXhETHuS3raKh237XZ01ZhrJEXhETHuS3jjB7g2BI4DNbG/UcZOmhQR4RFRF0kOAIynhfSLwIdvDuMDy0KULJSKqIGlTykWNDwcWAzvb/v1wWzVcCfCImPYk/TvwImAR8ETbdw65SdNCulAiYtqTdB9l9cF7WH4dls6m8U9HCfCIiErlijwREZVKgEdEVCoBHhFRqQR4RESl/j/4eEl+1UMjGAAAAABJRU5ErkJggg==\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "import numpy as np\n",
    "from sklearn.feature_selection import SelectKBest,f_classif #选择最好特征\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "features = ['Pclass','Age','Sex','SibSp','Parch','Fare','Embarked','FamilySize','NameLength','titles']\n",
    "print('提取特征后的数据：\\n',titanic[features].head())\n",
    "\n",
    "#执行特征选择\n",
    "selector = SelectKBest(f_classif,k=5) \n",
    "selector.fit(titanic[features],titanic['Survived']) \n",
    "\n",
    "#获取每个特征的得分\n",
    "scores = -np.log10(selector.pvalues_)\n",
    "print('每个特征的得分列表：\\n',scores)\n",
    "\n",
    "#绘制得分图，看哪些特征对我们来说是重要的\n",
    "plt.bar(range(len(features)),scores)\n",
    "plt.xticks(range(len(features)),features,rotation = 'vertical') #用features中的特征名来代替range(len(features))\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "上图就是特征重要性的一个柱状图，发现Age等一些特征好像影响不大，和刚开始的假设有较大出入，那么这些没用的特征就可以删除掉，只保留有用的特征即可"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 五、集成算法\n",
    "\n",
    "只追求精度的情况下，不考虑时间和资源（通常在机器学习的竞赛中用的比较多），可用多种本质上不同的分类算法进行集成。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 139,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "用交叉验证中的每一折test预测的标签：\n",
      " [0. 1. 1. 1. 0. 0. 0. 0. 1. 1. 1. 1. 0. 0. 1. 1. 0. 0. 1. 1. 0. 0. 1. 0.\n",
      " 0. 0. 0. 1. 1. 0. 0. 1. 1. 0. 1. 0. 0. 0. 0. 1. 0. 1. 0. 1. 1. 0. 0. 1.\n",
      " 0. 1. 0. 0. 1. 1. 1. 0. 1. 0. 1. 0. 0. 1. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0.\n",
      " 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 1. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.\n",
      " 0. 1. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n",
      " 0. 0. 0. 1. 0. 1. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 1. 1. 0. 0. 1. 1. 1. 0.\n",
      " 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 1. 1. 0.\n",
      " 0. 1. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 1. 0. 1. 1. 0. 0. 1. 0.\n",
      " 0. 1. 1. 1. 0. 0. 1. 1. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 1. 0. 0. 0. 1.\n",
      " 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0.\n",
      " 0. 1. 0. 0. 0. 1. 1. 1. 1. 0. 0. 1. 0. 0. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0.\n",
      " 1. 0. 0. 0. 1. 1. 0. 0. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n",
      " 0. 1. 1. 1. 0. 0. 0. 0. 0. 1. 0. 1. 1. 0. 0. 1. 1. 1. 1. 1. 0. 1. 1. 1.\n",
      " 1. 0. 0. 0. 1. 0. 1. 1. 0. 0. 1. 1. 0. 1. 0. 1. 0. 1. 0. 0. 0. 0. 1. 0.\n",
      " 0. 1. 0. 0. 1. 1. 0. 0. 0. 1. 0. 1. 1. 0. 0. 1. 0. 0. 0. 0. 1. 1. 1. 1.\n",
      " 0. 0. 0. 0. 0. 0. 1. 1. 0. 1. 1. 0. 1. 0. 0. 1. 1. 0. 0. 0. 1. 1. 1. 1.\n",
      " 0. 0. 0. 1. 0. 1. 0. 0. 0. 1. 1. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1.\n",
      " 0. 0. 0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 1. 1.\n",
      " 1. 0. 0. 1. 0. 1. 0. 0. 1. 0. 1. 1. 0. 1. 1. 0. 1. 1. 0. 0. 1. 0. 0. 0.\n",
      " 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 1. 0. 0. 0. 0. 0. 1.\n",
      " 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 1. 1. 0.\n",
      " 1. 1. 1. 1. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 1. 0. 1. 0. 0. 1. 0. 0. 1. 0.\n",
      " 0. 0. 1. 0. 0. 1. 0. 1. 1. 1. 0. 1. 1. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0.\n",
      " 0. 0. 0. 0. 1. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0.\n",
      " 1. 1. 1. 0. 1. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 1.\n",
      " 1. 0. 0. 0. 1. 0. 0. 1. 1. 1. 1. 0. 1. 0. 0. 1. 0. 1. 1. 0. 0. 1. 0. 0.\n",
      " 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 1.\n",
      " 0. 0. 0. 1. 0. 1. 1. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0.\n",
      " 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0.\n",
      " 0. 1. 1. 0. 1. 1. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 0. 0. 1. 0. 1. 1. 0. 0.\n",
      " 1. 0. 1. 0. 1. 0. 1. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0.\n",
      " 0. 1. 0. 1. 0. 0. 1. 1. 0. 0. 1. 1. 0. 0. 0. 1. 0. 0. 0. 1. 0. 1. 1. 0.\n",
      " 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 1. 0. 1. 1. 0. 0. 1. 0. 0. 1. 0. 0. 0.\n",
      " 0. 0. 0. 0. 0. 1. 0. 1. 0. 1. 1. 1. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0.\n",
      " 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 1. 0. 1. 0. 1. 0. 0. 0. 1. 0. 0. 1. 0.\n",
      " 1. 0. 1. 0. 0. 0. 0. 0. 1. 1. 0. 0. 1. 1. 1. 0. 1. 0. 1. 1. 0. 0. 1. 0.\n",
      " 0. 1. 1. 0. 0. 1. 0. 1. 0. 0. 1. 1. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 1.\n",
      " 1. 1. 0.]\n",
      "原始数据的标签：\n",
      " [0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0]\n",
      "在训练集上，集成算法的准确性：83.16%\n"
     ]
    }
   ],
   "source": [
    "from sklearn.ensemble import GradientBoostingClassifier\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.model_selection import KFold\n",
    "import numpy as np\n",
    "\n",
    "#组合我们要集成的算法，将多种算法集成为一个模型\n",
    "features = ['Pclass','Age','Sex','Fare','Embarked','FamilySize','NameLength','titles']\n",
    "# features = ['Pclass','Age','Sex','Fare','Embarked','FamilySize','titles']\n",
    "model = [\n",
    "    [GradientBoostingClassifier(random_state=1,n_estimators=1000,min_samples_split=4,min_samples_leaf=2,max_depth=4),features],\n",
    "    [LogisticRegression(random_state = 1,solver = 'liblinear'),features],\n",
    "]\n",
    "\n",
    "#初始化和交叉验证\n",
    "kf = KFold(n_splits=10,shuffle= False,random_state=1)\n",
    "\n",
    "predictions=[]  \n",
    "for train,test in kf.split(titanic):\n",
    "    train_lable = titanic['Survived'].iloc[train]\n",
    "    full_test_predictions = [] #存放的是一折情况下每个模型的预测的概率\n",
    "    #对集成算法里的每个算法都用每折数据进行预测\n",
    "    for model_i,feature in model:\n",
    "        #用train数据对每个模型进行训练\n",
    "        model_i.fit(titanic[features].iloc[train,:],train_lable)\n",
    "        #避免数据类型的错误，都转换成float型\n",
    "        test_predictions = model_i.predict_proba(titanic[features].iloc[test,:].astype(float))[:,1]\n",
    "#         test_predictions = model_i.predict(titanic[features].iloc[test,:])\n",
    "        full_test_predictions.append(test_predictions)\n",
    "    test_predictions = (full_test_predictions[0]+full_test_predictions[1])/2\n",
    "    test_predictions[test_predictions > 0.5] = 1\n",
    "    test_predictions[test_predictions <= 0.5] = 0\n",
    "    predictions.append(test_predictions)\n",
    "\n",
    "#将所有折test数据的预测结果放在一个数组中\n",
    "predictions = np.concatenate(predictions,axis=0)\n",
    "print('用交叉验证中的每一折test预测的标签：\\n',predictions)\n",
    "\n",
    "print('原始数据的标签：\\n',titanic['Survived'].to_list())\n",
    "\n",
    "#与训练集的数据比较得出正确率\n",
    "accuracy = sum(predictions == titanic['Survived'])/len(predictions)\n",
    "print('在训练集上，集成算法的准确性：%.2F%%'%(accuracy*100))\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 六、预测\n",
    "\n",
    "由以上各种分类算法可知，集成算法的分类效果最佳，在对test数据集进行预测，我们选用集成算法。\n",
    "\n",
    "可以将上述过程封装成函数，直接在预测部分调用"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 132,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "             Pclass                                          Name  Sex   Age  \\\n",
      "PassengerId                                                                    \n",
      "892               3                              Kelly, Mr. James    0  34.5   \n",
      "893               3              Wilkes, Mrs. James (Ellen Needs)    1  47.0   \n",
      "894               2                     Myles, Mr. Thomas Francis    0  62.0   \n",
      "895               3                              Wirz, Mr. Albert    0  27.0   \n",
      "896               3  Hirvonen, Mrs. Alexander (Helga E Lindqvist)    1  22.0   \n",
      "\n",
      "             SibSp  Parch   Ticket     Fare Cabin  Embarked  FamilySize  \\\n",
      "PassengerId                                                               \n",
      "892              0      0   330911   7.8292   NaN         3           0   \n",
      "893              1      0   363272   7.0000   NaN         0           1   \n",
      "894              0      0   240276   9.6875   NaN         3           0   \n",
      "895              0      0   315154   8.6625   NaN         0           0   \n",
      "896              1      1  3101298  12.2875   NaN         0           2   \n",
      "\n",
      "             NameLength titles  \n",
      "PassengerId                     \n",
      "892                  16      1  \n",
      "893                  32      3  \n",
      "894                  25      1  \n",
      "895                  16      1  \n",
      "896                  44      3  \n",
      "[0. 0. 0. 0. 1. 0. 1. 0. 1. 0. 0. 0. 1. 0. 1. 1. 0. 0. 1. 1. 0. 0. 1. 0.\n",
      " 1. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 1. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0.\n",
      " 1. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 1. 1. 1. 0. 0. 1. 1. 0.\n",
      " 1. 0. 1. 1. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 0. 1. 0. 0. 0.\n",
      " 1. 0. 1. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 0. 0. 1. 0. 1.\n",
      " 1. 0. 1. 0. 0. 1. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0.\n",
      " 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 1. 0. 1. 1. 0. 1. 0. 0. 1. 0. 0.\n",
      " 1. 1. 0. 0. 0. 0. 0. 1. 1. 0. 1. 1. 0. 0. 1. 0. 1. 0. 1. 0. 0. 0. 0. 0.\n",
      " 0. 0. 0. 0. 1. 1. 0. 1. 1. 0. 1. 1. 0. 0. 1. 0. 1. 0. 0. 0. 0. 1. 0. 0.\n",
      " 1. 0. 1. 0. 1. 0. 1. 0. 1. 1. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 1.\n",
      " 1. 1. 0. 0. 0. 0. 1. 0. 1. 1. 1. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 1.\n",
      " 0. 0. 0. 0. 1. 0. 0. 0. 1. 1. 0. 1. 0. 0. 0. 0. 1. 0. 1. 1. 1. 0. 0. 0.\n",
      " 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0.\n",
      " 0. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 1. 0. 0. 1. 0. 0.\n",
      " 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 1. 0. 1. 1. 0. 0. 0. 1. 0. 1. 0. 0. 1.\n",
      " 0. 1. 1. 0. 1. 0. 0. 1. 1. 0. 0. 1. 0. 0. 1. 1. 1. 0. 0. 0. 0. 0. 1. 1.\n",
      " 0. 1. 0. 0. 0. 0. 1. 1. 0. 0. 0. 1. 0. 1. 0. 0. 1. 0. 1. 1. 0. 0. 0. 0.\n",
      " 1. 1. 1. 1. 1. 0. 1. 0. 0. 0.]\n",
      "             Survived\n",
      "PassengerId          \n",
      "892                 0\n",
      "893                 0\n",
      "894                 0\n",
      "895                 0\n",
      "896                 1\n",
      "...               ...\n",
      "1305                0\n",
      "1306                1\n",
      "1307                0\n",
      "1308                0\n",
      "1309                0\n",
      "\n",
      "[418 rows x 1 columns]\n",
      "Done\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import re\n",
    "\n",
    "titanic_test = pd.read_csv('test.csv',index_col = 'PassengerId')\n",
    "# print('测试数据集：\\n',titanic_test)\n",
    "# print(titanic_test.describe())\n",
    "\n",
    "#发现Age和Fare 有缺失值，进行填补\n",
    "titanic_test['Age'] = titanic_test['Age'].fillna(titanic_test['Age'].median())\n",
    "titanic_test['Fare'] = titanic_test['Fare'].fillna(titanic_test['Fare'].median())\n",
    "# print(titanic_test.describe())\n",
    "\n",
    "#对字符型特征数值化\n",
    "# print(titanic['Sex'].unique())\n",
    "titanic_test.loc[titanic_test['Sex']=='male','Sex'] = 0\n",
    "titanic_test.loc[titanic_test['Sex']== 'female' ,'Sex'] = 1\n",
    "\n",
    "#对字符型特征数值化\n",
    "# print(titanic_test['Embarked'].unique())\n",
    "titanic_test.loc[titanic_test['Embarked']=='S','Embarked'] = 0\n",
    "titanic_test.loc[titanic_test['Embarked']=='C','Embarked'] = 1\n",
    "titanic_test.loc[titanic_test['Embarked']=='Q','Embarked'] = 3\n",
    "# print(titanic_test.head())\n",
    "\n",
    "#对test数据集的其他特征进行处理，添加其他提取的特征\n",
    "\n",
    "#可能家庭成员的数量对是否获救有关，则对该特征进行提取\n",
    "titanic_test['FamilySize'] = titanic_test['SibSp']+titanic_test['Parch']\n",
    "#可能名字长度对是否获救也有某种潜在的关联\n",
    "titanic_test['NameLength']=titanic_test['Name'].apply(lambda x:len(x))\n",
    "\n",
    "#使用正则表达式来对名字进行提取\n",
    "def get_title(name):\n",
    "    #名字总是由大小写字母组成，并以点号（.）结束\n",
    "    #在name中寻找title\n",
    "    title_search = re.search('([A-Za-z]+)\\.',name)\n",
    "    #如果存在\n",
    "    if title_search:\n",
    "        # 数据集的名字如Todoroff, Mr. Lalio 我们要找出其中的Mr 则返回group(1)\n",
    "        return title_search.group(1)\n",
    "    return ''\n",
    "\n",
    "#获取所有title（如Mr，Miss等） 并对其进行统计个数\n",
    "titles = titanic_test['Name'].apply(get_title)\n",
    "# print(titles)\n",
    "# print(pd.value_counts(titles))\n",
    "\n",
    "#将字符标签（Mr，Mrs,Miss等）数值化\n",
    "\n",
    "titles_mapping = { \n",
    "    \"Mr\": 1,\n",
    "    \"Miss\": 2,\n",
    "    \"Mrs\": 3,\n",
    "    \"Master\": 4,\n",
    "    \"Dr\": 5,\n",
    "    \"Rev\": 6,\n",
    "    \"Major\": 7,\n",
    "    \"Col\": 7,\n",
    "    \"Mlle\": 8,\n",
    "    \"Mme\": 8,\n",
    "    \"Dona\": 9,\n",
    "    \"Lady\": 10,\n",
    "    \"Countess\": 10,\n",
    "    \"Jonkheer\": 10,\n",
    "    \"Sir\": 9,\n",
    "    \"Capt\": 7,\n",
    "    \"Ms\": 2\n",
    "}\n",
    "\n",
    "for k,v in titles_mapping.items():\n",
    "    titles[k==titles]=v\n",
    "# print(pd.value_counts(titles))\n",
    "titanic_test['titles']=titles\n",
    "print(titanic_test.head())\n",
    "\n",
    "#使用集成算法预测\n",
    "from sklearn.ensemble import GradientBoostingClassifier\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "import numpy as np\n",
    "\n",
    "#组合我们要集成的算法，将多种算法集成为一个模型\n",
    "features = ['Pclass','Age','Sex','Fare','Embarked','FamilySize','titles']\n",
    "model = [\n",
    "    [GradientBoostingClassifier(random_state=1,n_estimators=1000,min_samples_split=4,min_samples_leaf=2,max_depth=4),features],\n",
    "    [LogisticRegression(random_state = 1,solver = 'liblinear'),features],\n",
    "]\n",
    "\n",
    "full_predictions = []\n",
    "for model_i,features in model:\n",
    "    # 用整个训练集对模型进行训练.\n",
    "    model_i.fit(titanic[features],titanic['Survived'])\n",
    "    # 使用测试数据集进行预测。我们必须将所有列都转换为浮点数以避免错误.\n",
    "    predictions = model_i.predict_proba(titanic_test[features].astype(float))[:, 1]\n",
    "    predictions[predictions <= .5] = 0\n",
    "    predictions[predictions > .5] = 1\n",
    "    full_predictions.append(predictions)\n",
    "print(predictions)\n",
    "\n",
    "#将预测结果保存为CSV文件\n",
    "test = pd.DataFrame(predictions,index = titanic_test.index,dtype = 'int',columns=['Survived'])\n",
    "print(test)\n",
    "test.to_csv('predictions_Survived.csv')\n",
    "print('Done')\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
